# Classification

The goal of classification is to find a function that separates the data into positive/negative labels. In the case of a linear classifier, this reduces to finding a set of parameters $w^\star$ such that, $$ \begin{align} w^\star &= \arg \min_w \sum_{i=1}^{N} \left[y_i\neq \text{sign} (w^\top x_i) \right] \\ &= \arg \min_w \sum_{i=1}^{N} l_{0/1} (w; x_i, y_i) \end{align}.$$ 

The problem with the $l_{0/1}$ loss is that it is non-convex (and non-differentiable); hence surrogate losses must be used instead to minimize the number of misclassified points. 

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import ipywidgets
from ipywidgets import interact, interactive, interact_manual
import IPython
from matplotlib import rcParams
rcParams['figure.figsize'] = (10, 5)
rcParams['font.size'] = 16

import numpy as np
import matplotlib.pyplot as plt
from utilities.util import gradient_descent
from utilities.load_data import linear_separable_data, circular_separable_data
from utilities import plot_helpers 
from utilities.classifiers import Perceptron, SVM, Logistic
from utilities.regularizers import L1Regularizer, L2Regularizer

In [None]:
# Figure defaults for this cell.
rcParams['figure.figsize'] = (10, 5)
rcParams['font.size'] = 16

# Data-generation settings shared by the cells below.
num_points = 100  # Number of points per class
noise = 0.2  # Noise Level (needed for data generation).
# NOTE: despite its name, this value is used further down as the fraction of
# points that end up in the *training* split (num_train is derived from it).
TEST_FRACTION = .80

# Fix the seed so that the generated data set is reproducible.
np.random.seed(42)
X, Y = linear_separable_data(num_points, noise=noise, dim=2)

# Scatter both classes on a single axis. The options of the second call
# additionally carry the axis labels and the legend flag.
fig = plt.subplot(111)
for label, opt in (
    (1, {'marker': 'ro', 'label': '+', 'size': 8}),
    (-1, {'marker': 'bs', 'label': '-', 'x_label': '$x$', 'y_label': '$y$', 'size': 8, 'legend': True}),
):
    rows = np.where(Y == label)[0]  # row indices of the points carrying this label
    plot_helpers.plot_data(X[rows, 0], X[rows, 1], fig=fig, options=opt)


In [None]:
# Figure defaults for this cell.
rcParams['figure.figsize'] = (10, 5)
rcParams['font.size'] = 16

# Separate into train and test sets!
# A random permutation of all 2 * num_points row indices decides the split.
indexes = np.arange(2 * num_points)
np.random.shuffle(indexes)
# NOTE: TEST_FRACTION is applied here as the share of points used for training.
num_train = int(np.ceil(2 * TEST_FRACTION * num_points))

train_rows, test_rows = indexes[:num_train], indexes[num_train:]
X_train, Y_train = X[train_rows], Y[train_rows]
X_test, Y_test = X[test_rows], Y[test_rows]

fig = plt.subplot(111)

# One entry per scatter call: (points, labels, class label, plot options).
# Train points use filled markers, test points use hollow ones; the options of
# the last call additionally carry the axis labels and the legend flag.
splits = (
    (X_train, Y_train, 1,
     {'marker': 'ro', 'fillstyle': 'full', 'label': '+ Train', 'size': 8}),
    (X_train, Y_train, -1,
     {'marker': 'bs', 'fillstyle': 'full', 'label': '- Train', 'size': 8}),
    (X_test, Y_test, 1,
     {'marker': 'ro', 'fillstyle': 'none', 'label': '+ Test', 'size': 8}),
    (X_test, Y_test, -1,
     {'marker': 'bs', 'fillstyle': 'none', 'label': '- Test', 'size': 8,
      'x_label': '$x$', 'y_label': '$y$', 'legend': True}),
)
for points, labels, sign, opt in splits:
    rows = np.where(labels == sign)[0]  # row indices of this class within the split
    plot_helpers.plot_data(points[rows, 0], points[rows, 1], fig=fig, options=opt)


### The Perceptron Algorithm

The perceptron loss is defined as: $$L(w; X, Y) = \sum_{i=1}^{N} L_p(w; x_i, y_i) = \sum_{i=1}^{N} \max \{ 0, -y_i w^\top x_i \}.$$

The loss function is continuous, but not differentiable at $y_i w^\top x_i=0$. The subgradient, however, exists and hence (stochastic) gradient descent converges. The subgradient is:

$$ \partial L_p(w; x_i,y_i) = \left\{\begin{array}{cc} 0 & \text{if } -y_i w^\top x_i < 0 \\ -y_i x_i & \text{if } -y_i w^\top x_i > 0 \\ \left[0, -y_i x_i \right] & \text{if } -y_i w^\top x_i = 0 \end{array} \right. $$

In [None]:
# Wider figure: the callback below draws two subplots side by side.
rcParams['figure.figsize'] = (20, 5)
rcParams['font.size'] = 16

# Slider for the number of iterations handed to gradient descent.
n_iter_widget = ipywidgets.IntSlider(
    value=20,
    min=5,
    max=100,
    step=1,
    description='Number of iterations:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the batch size; capped at the size of the current training set.
batch_size_widget = ipywidgets.IntSlider(
    value=1,
    min=1,
    max=X_train.shape[0],
    step=1,
    description='Batch Size:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the noise level used when regenerating the data.
noise_widget = ipywidgets.FloatSlider(
    value=0.2,
    min=0,
    max=1,
    step=0.1,
    readout_format='.1f',
    description='Noise:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

def change_learning_params(n_iter, batch_size, noise):
    """Regenerate the data, fit a Perceptron with SGD and plot the progression.

    Parameters
    ----------
    n_iter : int
        Passed to ``gradient_descent`` as the ``'n_iter'`` option.
    batch_size : int
        Passed to ``gradient_descent`` as the ``'batch_size'`` option.
    noise : float
        Noise level handed to ``linear_separable_data``.

    A ``KeyboardInterrupt`` raised while training or plotting is swallowed so
    that interrupting the kernel simply stops the run.
    """
    # Same seed on every call, so only `noise` changes the generated data.
    np.random.seed(42)
    X, Y = linear_separable_data(num_points, noise=noise, dim=2)

    # Shuffle the row indices and cut them into a train part and a test part.
    shuffled = np.arange(2 * num_points)
    np.random.shuffle(shuffled)
    num_train = int(np.ceil(2 * TEST_FRACTION * num_points))
    train_rows, test_rows = shuffled[:num_train], shuffled[num_train:]

    X_train, Y_train = X[train_rows], Y[train_rows]
    X_test, Y_test = X[test_rows], Y[test_rows]

    classifier = Perceptron(X_train, Y_train)
    classifier.load_test_data(X_test, Y_test)

    # Re-seed so the initial weights do not depend on the sampling above.
    # Three entries: presumably two feature weights plus a bias -- TODO confirm.
    np.random.seed(42)
    w0 = np.random.randn(3, )

    sgd_opts = {
        'eta0': 1,
        'n_iter': n_iter,
        'batch_size': batch_size,
        'n_samples': X_train.shape[0],
        'algorithm': 'SGD',
        'learning_rate_scheduling': None,
    }
    try:
        trajectory, sgd_indexes = gradient_descent(w0, classifier, opts=sgd_opts)

        contour_plot = plt.subplot(121)
        error_plot = plt.subplot(122)

        # One row per scatter call: (points, labels, class, marker, fill, legend text).
        scatter_specs = (
            (X_train, Y_train, 1, 'ro', 'full', '+ Train'),
            (X_train, Y_train, -1, 'bs', 'full', '- Train'),
            (X_test, Y_test, 1, 'ro', 'none', '+ Test'),
            (X_test, Y_test, -1, 'bs', 'none', '- Test'),
        )
        for points, labels, sign, marker, fill, legend_text in scatter_specs:
            rows = np.where(labels == sign)[0]
            opt = {'marker': marker, 'fillstyle': fill, 'label': legend_text, 'size': 8}
            plot_helpers.plot_data(points[rows, 0], points[rows, 1],
                                   fig=contour_plot, options=opt)

        contour_opts = {'n_points': 50, 'x_label': '$x$', 'y_label': '$y$',
                        'sgd_point': True, 'n_classes': 4}
        error_opts = {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'}
        plot_opts = {'contour_opts': contour_opts, 'error_opts': error_opts}

        plot_helpers.classification_progression(
            X, Y, trajectory, sgd_indexes, classifier,
            contour_plot=contour_plot, error_plot=error_plot,
            options=plot_opts)
    except KeyboardInterrupt:
        # Deliberate: lets the user abort a long run without a traceback.
        pass
 
# Show the sliders together with a manual run button; the trailing semicolon
# keeps the notebook from echoing the returned object.
interact_manual(
    change_learning_params,
    n_iter=n_iter_widget,
    batch_size=batch_size_widget,
    noise=noise_widget,
);

## The SVM Algorithm

The SVM loss (also known as the hinge loss) is defined as: $$L(w; X, Y) = \sum_{i=1}^{N} L_{\text{svm}} (w; x_i, y_i) = \sum_{i=1}^{N} \max \{ 0, 1-y_i w^\top x_i \}.$$

The loss function is continuous, but not differentiable at $y_i w^\top x_i=1$. The subgradient, however, exists and hence (stochastic) gradient descent converges. The subgradient is:

$$ \partial L_{\text{svm}}(w;x_i,y_i) = \left\{\begin{array}{cc} 0 & \text{if } 1-y_i w^\top x_i < 0 \\ -y_i x_i & \text{if } 1-y_i w^\top x_i > 0 \\ \left[0, -y_i x_i \right] & \text{if } 1-y_i w^\top x_i = 0 \end{array} \right. $$

The difference from the perceptron loss is that the SVM loss includes a margin: points that are classified correctly but with $y_i w^\top x_i < 1$ still incur a positive loss. 

In [None]:
# Figure defaults for this cell.
rcParams['figure.figsize'] = (10, 5)
rcParams['font.size'] = 16

# Slider for the base-10 exponent of the regularization strength.
reg_widget = ipywidgets.FloatSlider(
    value=-6,
    min=-6,
    max=3,
    step=0.5,
    readout_format='.1f',
    description='Regularization 10^:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the initial learning rate.
lr_widget = ipywidgets.FloatSlider(
    value=1,
    min=0.1,
    max=2,
    step=0.1,
    readout_format='.1f',
    description='Learning rate:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the number of iterations handed to gradient descent.
n_iter_widget = ipywidgets.IntSlider(
    value=20,
    min=5,
    max=100,
    step=1,
    description='Number of iterations:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the batch size; capped at the size of the current training set.
batch_size_widget = ipywidgets.IntSlider(
    value=1,
    min=1,
    max=X_train.shape[0],
    step=1,
    description='Batch Size:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

# Slider for the noise level used when regenerating the data.
noise_widget = ipywidgets.FloatSlider(
    value=0.2,
    min=0,
    max=1,
    step=0.1,
    readout_format='.1f',
    description='Noise:',
    style={'description_width': 'initial'},
    continuous_update=False,
)

def change_learning_params(reg, eta0, n_iter, batch_size, noise):
    """Regenerate the data, fit an L2-regularized SVM with SGD and plot the progression.

    Parameters
    ----------
    reg : float
        Base-10 exponent of the regularization strength; the regularizer is
        built with ``10 ** reg``.
    eta0 : float
        Passed to ``gradient_descent`` as the ``'eta0'`` option.
    n_iter : int
        Passed to ``gradient_descent`` as the ``'n_iter'`` option.
    batch_size : int
        Passed to ``gradient_descent`` as the ``'batch_size'`` option; it also
        divides the regularization weight stored under ``'reg'``.
    noise : float
        Noise level handed to ``linear_separable_data``.

    A ``KeyboardInterrupt`` raised while training or plotting is swallowed so
    that interrupting the kernel simply stops the run.
    """
    # Same seed on every call, so only `noise` changes the generated data.
    np.random.seed(42)
    X, Y = linear_separable_data(num_points, noise=noise, dim=2)

    # Shuffle the row indices and cut them into a train part and a test part.
    shuffled = np.arange(2 * num_points)
    np.random.shuffle(shuffled)
    num_train = int(np.ceil(2 * TEST_FRACTION * num_points))
    train_rows, test_rows = shuffled[:num_train], shuffled[num_train:]

    X_train, Y_train = X[train_rows], Y[train_rows]
    X_test, Y_test = X[test_rows], Y[test_rows]

    classifier = SVM(X_train, Y_train)
    classifier.load_test_data(X_test, Y_test)

    regularizer = L2Regularizer(np.power(10., reg))

    # Re-seed so the initial weights do not depend on the sampling above.
    # Three entries: presumably two feature weights plus a bias -- TODO confirm.
    np.random.seed(42)
    w0 = np.random.randn(3, )

    sgd_opts = {
        'eta0': eta0,
        'n_iter': n_iter,
        'batch_size': batch_size,
        'n_samples': X_train.shape[0],
        'algorithm': 'SGD',
        'learning_rate_scheduling': 'AnnealingSVM',
        'reg': regularizer.get_lambda() / batch_size,
    }
    try:
        trajectory, sgd_indexes = gradient_descent(w0, classifier, regularizer, sgd_opts)

        contour_plot = plt.subplot(121)
        error_plot = plt.subplot(122)

        # One row per scatter call: (points, labels, class, marker, fill, legend text).
        scatter_specs = (
            (X_train, Y_train, 1, 'ro', 'full', '+ Train'),
            (X_train, Y_train, -1, 'bs', 'full', '- Train'),
            (X_test, Y_test, 1, 'ro', 'none', '+ Test'),
            (X_test, Y_test, -1, 'bs', 'none', '- Test'),
        )
        for points, labels, sign, marker, fill, legend_text in scatter_specs:
            rows = np.where(labels == sign)[0]
            opt = {'marker': marker, 'fillstyle': fill, 'label': legend_text, 'size': 8}
            plot_helpers.plot_data(points[rows, 0], points[rows, 1],
                                   fig=contour_plot, options=opt)

        contour_opts = {'n_points': 100, 'x_label': '$x$', 'y_label': '$y$',
                        'sgd_point': True, 'n_classes': 4}
        error_opts = {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'}
        plot_opts = {'contour_opts': contour_opts, 'error_opts': error_opts}

        plot_helpers.classification_progression(
            X, Y, trajectory, sgd_indexes, classifier,
            contour_plot=contour_plot, error_plot=error_plot,
            options=plot_opts)
    except KeyboardInterrupt:
        # Deliberate: lets the user abort a long run without a traceback.
        pass
# Wire the sliders to the callback; the trailing semicolon keeps the notebook
# from echoing the returned object.
interact(
    change_learning_params,
    reg=reg_widget,
    eta0=lr_widget,
    n_iter=n_iter_widget,
    batch_size=batch_size_widget,
    noise=noise_widget,
);